/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.quality; import java.io.*; import java.net.*; import java.util.*; import net.nutch.io.*; import net.nutch.searcher.*; import net.nutch.net.protocols.http.*; import net.nutch.quality.dynamic.*; /********************************************* * The PageExtractor creates a PageDescription from the * indicated file, then uses it to extract the info * from a downloaded HTML page. * *********************************************/ public class PageExtractor { /** * Lets us abstract the differences between a remote * search engine and Nutch */ public static interface IExtractor { ArrayList applyQuery(String query) throws IOException; } /** * An IExtractor wrapper for PageExtractor */ public static class RemotePageExtractor implements IExtractor { PageExtractor pageExtractor; /** */ public RemotePageExtractor(File pageDesc, String userAgent, boolean debug) throws IOException, ParseException { pageExtractor = new PageExtractor(pageDesc, userAgent, debug); } /** * The Remote Engine searcher will return a list of * HashMap items, each of which could hold a number of * fields. We're only interested in the "href" one. */ public ArrayList applyQuery(String query) throws IOException { ArrayList results = pageExtractor.applyQuery(query); if (results == null) { return results; } ArrayList strResults = new ArrayList(); for (Iterator it = results.iterator(); it.hasNext(); ) { HashMap hashmap = (HashMap) it.next(); String val = (String) hashmap.get("href"); strResults.add(val); } return strResults; } } /** * A local segment-searcher that will return search queries */ public static class NutchExtractor implements IExtractor { private NutchBean searcher; /** */ public NutchExtractor(String dir) throws IOException { searcher = new NutchBean(new File(dir)); } /** */ public ArrayList applyQuery(String queryStr) throws IOException { ArrayList results = new ArrayList(); Query query = Query.parse(queryStr); Hits hits = searcher.search(query, 10); long max = Math.min(hits.getTotal(), 10); for (int i = 0; i < max; i++) { HitDetails details = searcher.getDetails(hits.getHit(i)); results.add(details.getValue("url")); } return results; } } boolean debug; String userAgent; PageDescription desc; Http http; URL url; /** */ public PageExtractor(File pageDesc, String userAgent, boolean debug) throws IOException, ParseException { this.debug = debug; this.userAgent = userAgent; this.http = new Http(); http.setAgentString(userAgent); http.setAgentEmail(""); http.setTimeout(10000); InputStream in = new FileInputStream(pageDesc); try { this.desc = new PageDescription(in); desc.parse(); } finally { in.close(); } } /** * Apply the query, and parse out the results using the Page * Description. */ public ArrayList applyQuery(String query) throws IOException { ArrayList interprets = desc.getInterprets(); HashMap curInterpret = null; String page = getPage(query); if (debug) { System.err.println(page); } String interpretRegion = null; if (interprets.size() == 0) { interpretRegion = getResultList(page, new HashMap()); } else { for (Iterator it = interprets.iterator(); it.hasNext(); ) { curInterpret = (HashMap) it.next(); interpretRegion = getResultList(page, curInterpret); if (interpretRegion != null) { break; } } } // // Apply the interpret directive to the found region // if (interpretRegion != null) { ArrayList items = new ArrayList(); String itemStart = (String) curInterpret.get("resultitemstart"); boolean trimItemStart = !"true".equalsIgnoreCase((String) curInterpret.get("keepitemstart")); if (itemStart == null) { itemStart = "HREF="; } String itemEnd = (String) curInterpret.get("resultitemend"); boolean trimItemEnd = false; if (itemEnd == null) { itemEnd = itemStart; trimItemEnd = true; } // // Go through the content, looking for "itemStart" strings. // for (int start = page.indexOf(itemStart); start != -1; start = page.indexOf(itemStart, start)) { int itemEndIndex; if (trimItemStart) { start += itemStart.length(); itemEndIndex = page.indexOf(itemEnd, start); } else { itemEndIndex = page.indexOf(itemEnd, start+itemStart.length()); } if (itemEndIndex < 0) { itemEndIndex = page.length(); } else if (!trimItemEnd) { itemEndIndex += itemEnd.length(); } String resultItem = page.substring(start, itemEndIndex).trim(); items.add(parseResultItem(resultItem, curInterpret)); start = itemEndIndex; } return items; } return null; } /** * This uses the query string and the PageDescriptor to * contact the server and fetch a page of content. This * page is returned as a String. * * We will still need to extract the relevant fields. */ private String getPage(String query) throws IOException { // First, build the HTTP Connection HashMap values = desc.getValues(); String action = (String) values.get("action"); String method = (String) values.get("method"); String fullQuery = buildQueryString(query); this.url = new URL(action + fullQuery); //HttpURLConnection con = (HttpURLConnection) url.openConnection(); // How to handle 'method' here? try { return new String(http.getResponse(url).getContent()); } catch (HttpException e) { throw new IOException("HttpException: " + e.getMessage()); } } /** */ private String buildQueryString(String query) { StringBuffer queryString = new StringBuffer(); queryString.append("?"); ArrayList inputs = desc.getInputs(); int count = 0; for (Iterator it = inputs.iterator(); it.hasNext(); count++) { HashMap input = (HashMap) it.next(); if ("browser".equals((String) input.get("mode"))) { continue; // not for us } String name = (String) input.get("name"); if (name == null) { throw new RuntimeException("input has no name: " + input); } if (count != 0) { queryString.append("&"); } queryString.append(name); queryString.append("="); if (input.containsKey("user")) { try { queryString.append(URLEncoder.encode(query, "UTF-8")); } catch (UnsupportedEncodingException uee) { } } else { String value = (String) input.get("value"); if (value != null) { queryString.append(value); } } } return queryString.toString(); } /** */ private HashMap parseResultItem(String html, HashMap interpret) { String extractArg = (String) interpret.get("extractarg"); HashMap item = new HashMap(); item.put("html", html.trim()); int hrefStart = indexOfIgnoreCase("href=", html, 0); if (hrefStart != -1) { int hrefEnd = html.indexOf(">", hrefStart+5); if (hrefEnd != -1) { String href = html.substring(hrefStart+5,hrefEnd); href = trimQuotes(href, '\"'); href = trimQuotes(href, '\''); href = href.trim(); if (!href.startsWith("http:")) { try { href = new URL(url, href).toString(); } catch (MalformedURLException e) { } } // // Sometimes what we want is embedded in another // URL. Use this to extract it. // if (extractArg != null) { int argIndex = href.indexOf("?" + extractArg + "="); if (argIndex < 0) { argIndex = href.indexOf("&" + extractArg + "="); } if (argIndex >= 0) { int end = href.indexOf("&", argIndex + 1); if (end < 0) { end = href.length(); } href = href.substring(argIndex + extractArg.length() + 2, end); // Remove escaped chars, if any try { href = URLDecoder.decode(href, "utf-8"); } catch (UnsupportedEncodingException uee) { uee.printStackTrace(); } } } item.put("href", href.trim()); int anchorStart = hrefEnd+1; int anchorEnd = indexOfIgnoreCase("</a>", html, anchorStart); if (anchorEnd != -1) item.put("anchor", html.substring(anchorStart,anchorEnd).trim()); } } getItemSection(item, html, "relevance", interpret, "relevancestart", "relevanceend"); getItemSection(item, html, "price", interpret, "pricestart", "priceend"); getItemSection(item, html, "avail", interpret, "availstart", "availend"); getItemSection(item, html, "date", interpret, "datestart", "dateend"); getItemSection(item, html, "name", interpret, "namestart", "nameend"); getItemSection(item, html, "email", interpret, "emailstart", "emailend"); return item; } /** */ private void getItemSection(HashMap item, String html, String name, HashMap interpret, String start, String end) { String section = getSection(html, interpret, start, end); if (section != null) { item.put(name, removeTags(section).trim()); } } /** */ private String getSection(String text, HashMap interpret, String startName, String endName) { String start = (String) interpret.get(startName); String end = (String) interpret.get(endName); if (start != null && end != null) { int startIndex = text.indexOf(start); if (startIndex != -1) { int endIndex = text.indexOf(end, startIndex + start.length()); if (endIndex != -1) return text.substring(startIndex+start.length(), endIndex); } } return null; } /** */ private int indexOfIgnoreCase(String pattern, String text, int start) { int patternLength = pattern.length(); int end = text.length() - patternLength; for (int i = start; i <= end; i++) { if (text.regionMatches(true, i, pattern, 0, patternLength)) return i; } return -1; } /** */ private String trimQuotes(String href, char quote) { int quoteStart = href.indexOf(quote); if (quoteStart != -1) { int quoteEnd = href.indexOf(quote, quoteStart+1); if (quoteEnd != -1) return href.substring(quoteStart+1, quoteEnd); } return href; } /** */ private String removeTags(String html) { StringBuffer result = new StringBuffer(); int start = 0; for (int i = html.indexOf('<'); i >= 0; i = html.indexOf('<', start)) { int j = html.indexOf('>', i+1); if (j < 0) break; result.append(html.substring(start, i)); start = j+1; } if (start == 0) return html; result.append(html.substring(start, html.length())); return result.toString(); } /** * Apply an Interpret set against the page contents. */ private String getResultList(String page, HashMap interpret) { String start = (String) interpret.get("resultliststart"); String end = (String) interpret.get("resultlistend"); int startIndex = 0; int endIndex = page.length(); if (start != null) { if ((startIndex = page.indexOf(start)) < 0) { return null; } } if (end != null) { if ((endIndex = page.indexOf(end, startIndex)) < 0) { return null; } } return page.substring(startIndex, endIndex); } /** * We emit stats */ public void emitStats() { HashMap descValues = desc.getValues(); ArrayList inputs = desc.getInputs(); ArrayList interprets = desc.getInterprets(); System.out.println("Plugin name: " + (String) descValues.get("name")); System.out.println("Plugin URL: " + (String) descValues.get("url")); System.out.println("--------------------------------------------"); for (Iterator it = descValues.keySet().iterator(); it.hasNext(); ) { String key = (String) it.next(); System.out.println(" " + key + ": " + (String) descValues.get(key)); } System.out.println(); for (Iterator listKeys = inputs.iterator(); listKeys.hasNext(); ) { System.out.println("Inputs:"); HashMap curValues = (HashMap) listKeys.next(); for (Iterator it = curValues.keySet().iterator(); it.hasNext(); ) { String key = (String) it.next(); System.out.println(" " + key + ": " + (String) curValues.get(key)); } System.out.println(); } System.out.println(); for (Iterator listKeys = interprets.iterator(); listKeys.hasNext(); ) { System.out.println("Interprets:"); HashMap curValues = (HashMap) listKeys.next(); for (Iterator it = curValues.keySet().iterator(); it.hasNext(); ) { String key = (String) it.next(); System.out.println(" " + key + ": " + (String) curValues.get(key)); } System.out.println(); } } /** */ public static void main(String argv[]) throws IOException, ParseException { if (argv.length < 3) { System.out.println("Usage: java net.nutch.quality.PageExtractor <pageDesc> <userAgent> <query> [-debug]"); return; } String pageDesc = argv[0]; String userAgent = argv[1]; String query = argv[2]; boolean debug = false; if (argv.length > 3) { if ("-debug".equals(argv[3])) { debug = true; } } PageExtractor extractor = new PageExtractor(new File(pageDesc), userAgent, debug); ArrayList outs = extractor.applyQuery(query); if (outs == null) { System.out.println("Sorry, no results"); } else { System.out.println("Number items: " + outs.size()); System.out.println(); for (Iterator it = outs.iterator(); it.hasNext(); ) { HashMap hashmap = (HashMap) it.next(); String hit = (String) hashmap.get("href"); System.out.println(hit); } } } }